## Warning: package 'choroplethr' was built under R version 4.3.2
## Warning: package 'acs' was built under R version 4.3.2
## Warning: package 'choroplethrMaps' was built under R version 4.3.2
clean names and select necessary variables
rename the variables
drop any rows with missing observations
filter out the second row (contains comments about the columns)
join all the datasets
drop any rows with missing observations
create new column, id, using the geo id
filter to only keep counties relevant to NYC
Crowded households may promote an environment that promotes the nesting of rats due to increased garbage or unhygienic behaviors. There may be increased sightings of rats in these households because there are more people able to spot them.
crowding18 =
read_csv("Data/crowding/2018_crowding.csv", col_names = TRUE) |>
janitor::clean_names() |>
select(geo_id, name, s2501_c01_006e, s2501_c01_007e, s2501_c01_008e, s2501_c01_001e) |>
rename(
less_than_1_y18 = s2501_c01_006e,
bet_1to1.5_y18 = s2501_c01_007e,
more_than_1.5_y18 = s2501_c01_008e,
total_homes_y18 = s2501_c01_001e,
) |>
drop_na() |>
filter(!row_number() %in% c(1)) |>
mutate_at(c("less_than_1_y18","bet_1to1.5_y18","more_than_1.5_y18", "total_homes_y18"), as.numeric) |>
mutate(
less_prop_y18 = less_than_1_y18/total_homes_y18,
bet_prop_y18 = bet_1to1.5_y18/total_homes_y18,
more_prop_y18 = more_than_1.5_y18/total_homes_y18,
)
crowding19 =
read_csv("Data/crowding/2019_crowding.csv", col_names = TRUE) |>
janitor::clean_names() |>
select(geo_id, s2501_c01_006e, s2501_c01_007e, s2501_c01_008e, s2501_c01_001e) |>
rename(
less_than_1_y19 = s2501_c01_006e,
bet_1to1.5_y19 = s2501_c01_007e,
more_than_1.5_y19 = s2501_c01_008e,
total_homes_y19 = s2501_c01_001e,
) |>
drop_na() |>
filter(!row_number() %in% c(1)) |>
mutate_at(c("less_than_1_y19","bet_1to1.5_y19","more_than_1.5_y19", "total_homes_y19"), as.numeric) |>
mutate(
less_prop_y19 = less_than_1_y19/total_homes_y19,
bet_prop_y19 = bet_1to1.5_y19/total_homes_y19,
more_prop_y19 = more_than_1.5_y19/total_homes_y19,
)
crowding20 =
read_csv("Data/crowding/2020_crowding.csv", col_names = TRUE) |>
janitor::clean_names() |>
select(geo_id, s2501_c01_006e, s2501_c01_007e, s2501_c01_008e, s2501_c01_001e) |>
rename(
less_than_1_y20 = s2501_c01_006e,
bet_1to1.5_y20 = s2501_c01_007e,
more_than_1.5_y20 = s2501_c01_008e,
total_homes_y20 = s2501_c01_001e,
) |>
drop_na() |>
filter(!row_number() %in% c(1)) |>
mutate_at(c("less_than_1_y20","bet_1to1.5_y20","more_than_1.5_y20", "total_homes_y20"), as.numeric) |>
mutate(
less_prop_y20 = less_than_1_y20/total_homes_y20,
bet_prop_y20 = bet_1to1.5_y20/total_homes_y20,
more_prop_y20 = more_than_1.5_y20/total_homes_y20,
)
crowding21 =
read_csv("Data/crowding/2021_crowding.csv", col_names = TRUE) |>
janitor::clean_names() |>
select(geo_id, s2501_c01_006e, s2501_c01_007e, s2501_c01_008e, s2501_c01_001e) |>
rename(
less_than_1_y21 = s2501_c01_006e,
bet_1to1.5_y21 = s2501_c01_007e,
more_than_1.5_y21 = s2501_c01_008e,
total_homes_y21 = s2501_c01_001e,
) |>
drop_na() |>
filter(!row_number() %in% c(1)) |>
mutate_at(c("less_than_1_y21","bet_1to1.5_y21","more_than_1.5_y21", "total_homes_y21"), as.numeric) |>
mutate(
less_prop_y21 = less_than_1_y21/total_homes_y21,
bet_prop_y21 = bet_1to1.5_y21/total_homes_y21,
more_prop_y21 = more_than_1.5_y21/total_homes_y21,
)
crowding_10s =
full_join(crowding18, crowding19, by = "geo_id")
crowding_20s =
full_join(crowding20, crowding21, by = "geo_id")
crowding_all =
full_join(crowding_10s, crowding_20s, by = "geo_id") |>
drop_na() |>
mutate(
id = str_sub(geo_id, 10),
county = str_extract(name, "(?<=,)[^,]+(?=,)")
) |>
filter(county %in% c(" Kings County"," Bronx County"," Queens County"," Richmond County",
" New York County"))
Prior research in other cities has shown an association between low education and rat sightings. We hypothesized that individuals with lower education may be unable to access newly renovated, or well-maintained housing units. As a result, they may be more likely to come into contact with rats or live in areas that contains large numbers of the rat population.
edu18 =
read_csv("Data/edu/edu_2018.csv", col_names = TRUE) |>
janitor::clean_names() |>
select(geo_id, name, s1501_c01_007e, s1501_c01_008e, s1501_c01_009e, s1501_c01_010e, s1501_c01_011e,
s1501_c01_012e, s1501_c01_013e, s1501_c01_006e) |>
rename(
less_9_y18 = s1501_c01_007e,
no_hs_diploma_y18 = s1501_c01_008e,
hs_grad_y18 = s1501_c01_009e,
some_college_y18 = s1501_c01_010e,
associate_y18 = s1501_c01_011e,
bachelor_y18 = s1501_c01_012e,
graduate_y18 = s1501_c01_013e,
total_pop_y18 = s1501_c01_006e
) |>
drop_na() |>
filter(!row_number() %in% c(1)) |>
mutate_at(c("less_9_y18","no_hs_diploma_y18","hs_grad_y18","some_college_y18","associate_y18",
"bachelor_y18","graduate_y18", "total_pop_y18"), as.numeric) |>
mutate(
hs_or_less_y18 = (rowSums(across(c(less_9_y18,no_hs_diploma_y18,hs_grad_y18))))/total_pop_y18,
college_y18 = (rowSums(across(c(some_college_y18,associate_y18,bachelor_y18,graduate_y18))))/total_pop_y18
)
edu19 =
read_csv("Data/edu/edu_2019.csv", col_names = TRUE) |>
janitor::clean_names() |>
select(geo_id, s1501_c01_007e, s1501_c01_008e, s1501_c01_009e, s1501_c01_010e, s1501_c01_011e,
s1501_c01_012e, s1501_c01_013e, s1501_c01_006e) |>
rename(
less_9_y19 = s1501_c01_007e,
no_hs_diploma_y19 = s1501_c01_008e,
hs_grad_y19 = s1501_c01_009e,
some_college_y19 = s1501_c01_010e,
associate_y19 = s1501_c01_011e,
bachelor_y19 = s1501_c01_012e,
graduate_y19 = s1501_c01_013e,
total_pop_y19 = s1501_c01_006e
) |>
drop_na() |>
filter(!row_number() %in% c(1)) |>
mutate_at(c("less_9_y19","no_hs_diploma_y19","hs_grad_y19","some_college_y19","associate_y19",
"bachelor_y19","graduate_y19", "total_pop_y19"), as.numeric) |>
mutate(
hs_or_less_y19 = (rowSums(across(c(less_9_y19,no_hs_diploma_y19,hs_grad_y19))))/total_pop_y19,
college_y19 = (rowSums(across(c(some_college_y19,associate_y19,bachelor_y19,graduate_y19))))/total_pop_y19
)
edu20 =
read_csv("Data/edu/edu_2020.csv", col_names = TRUE) |>
janitor::clean_names() |>
select(geo_id, s1501_c01_007e, s1501_c01_008e, s1501_c01_009e, s1501_c01_010e, s1501_c01_011e,
s1501_c01_012e, s1501_c01_013e, s1501_c01_006e) |>
rename(
less_9_y20 = s1501_c01_007e,
no_hs_diploma_y20 = s1501_c01_008e,
hs_grad_y20 = s1501_c01_009e,
some_college_y20 = s1501_c01_010e,
associate_y20 = s1501_c01_011e,
bachelor_y20 = s1501_c01_012e,
graduate_y20 = s1501_c01_013e,
total_pop_y20 = s1501_c01_006e
) |>
drop_na() |>
filter(!row_number() %in% c(1)) |>
mutate_at(c("less_9_y20","no_hs_diploma_y20","hs_grad_y20","some_college_y20","associate_y20",
"bachelor_y20","graduate_y20", "total_pop_y20"), as.numeric) |>
mutate(
hs_or_less_y20 = (rowSums(across(c(less_9_y20,no_hs_diploma_y20,hs_grad_y20))))/total_pop_y20,
college_y20 = (rowSums(across(c(some_college_y20,associate_y20,bachelor_y20,graduate_y20))))/total_pop_y20
)
edu21 =
read_csv("Data/edu/edu_2021.csv", col_names = TRUE) |>
janitor::clean_names() |>
select(geo_id, s1501_c01_007e, s1501_c01_008e, s1501_c01_009e, s1501_c01_010e, s1501_c01_011e,
s1501_c01_012e, s1501_c01_013e, s1501_c01_006e) |>
rename(
less_9_y21 = s1501_c01_007e,
no_hs_diploma_y21 = s1501_c01_008e,
hs_grad_y21 = s1501_c01_009e,
some_college_y21 = s1501_c01_010e,
associate_y21 = s1501_c01_011e,
bachelor_y21 = s1501_c01_012e,
graduate_y21 = s1501_c01_013e,
total_pop_y21 = s1501_c01_006e
) |>
drop_na() |>
filter(!row_number() %in% c(1)) |>
mutate_at(c("less_9_y21","no_hs_diploma_y21","hs_grad_y21","some_college_y21","associate_y21",
"bachelor_y21","graduate_y21", "total_pop_y21"), as.numeric) |>
mutate(
hs_or_less_y21 = (rowSums(across(c(less_9_y21,no_hs_diploma_y21,hs_grad_y21))))/total_pop_y21,
college_y21 = (rowSums(across(c(some_college_y21,associate_y21,bachelor_y21,graduate_y21))))/total_pop_y21
)
edu_10s =
full_join(edu18, edu19, by = "geo_id")
edu_20s =
full_join(edu20, edu21, by = "geo_id")
edu_all =
full_join(edu_10s, edu_20s, by = "geo_id") |>
drop_na() |>
mutate(
id = str_sub(geo_id, 10),
county = str_extract(name, "(?<=,)[^,]+(?=,)")
) |>
filter(county %in% c(" Kings County"," Bronx County"," Queens County"," Richmond County",
" New York County"))
Lower-income areas might have fewer resources/organizations for building maintenance and rodent control which in turn creates an environment that can sustain the rat population. On the other hand, higher-income areas might be more likely to report rat sightings.
poverty18 =
read_csv("Data/poverty/2018_poverty.csv", col_names = TRUE) |>
janitor::clean_names() |>
select(geo_id, name, s1701_c03_001e) |>
rename(
below_poverty_y18 = s1701_c03_001e) |>
drop_na() |>
filter(!row_number() %in% c(1)) |>
mutate_at(c("below_poverty_y18"), as.numeric) |>
mutate(prop_poverty_y18 = below_poverty_y18/100)
poverty19 =
read_csv("Data/poverty/2019_poverty.csv", col_names = TRUE) |>
janitor::clean_names() |>
select(geo_id, s1701_c03_001e) |>
rename(
below_poverty_y19 = s1701_c03_001e) |>
drop_na() |>
filter(!row_number() %in% c(1)) |>
mutate_at(c("below_poverty_y19"), as.numeric) |>
mutate(prop_poverty_y19 = below_poverty_y19/100)
poverty20 =
read_csv("Data/poverty/2020_poverty.csv", col_names = TRUE) |>
janitor::clean_names() |>
select(geo_id, s1701_c03_001e) |>
rename(
below_poverty_y20 = s1701_c03_001e) |>
drop_na() |>
filter(!row_number() %in% c(1)) |>
mutate_at(c("below_poverty_y20"), as.numeric) |>
mutate(prop_poverty_y20 = below_poverty_y20/100)
poverty21 =
read_csv("Data/poverty/2021_poverty.csv", col_names = TRUE) |>
janitor::clean_names() |>
select(geo_id, s1701_c03_001e) |>
rename(
below_poverty_y21 = s1701_c03_001e) |>
drop_na() |>
filter(!row_number() %in% c(1)) |>
mutate_at(c("below_poverty_y21"), as.numeric) |>
mutate(prop_poverty_y21 = below_poverty_y21/100)
poverty_10s =
full_join(poverty18, poverty19, by = "geo_id")
poverty_20s =
full_join(poverty20, poverty21, by = "geo_id")
poverty_all =
full_join(poverty_10s, poverty_20s, by = "geo_id") |>
drop_na() |>
mutate(
id = str_sub(geo_id, 10),
county = str_extract(name, "(?<=,)[^,]+(?=,)")
) |>
filter(county %in% c(" Kings County"," Bronx County"," Queens County"," Richmond County",
" New York County"))
Empty households can be conducive to supporting the rat population since with no people are actively living there, there is no one reporting the rats or attempting to control the size of the population. It allows the rats to nest and hide away from severe environmental conditions (rain, snow) and predators.
vacancy18 =
read_csv("Data/vacancy/2018_vacancy.csv", col_names = TRUE) |>
janitor::clean_names() |>
select(geo_id, name, b25004_001e, b25004_008e) |>
rename(
total_home_y18 = b25004_001e,
vacant_y18 = b25004_008e) |>
drop_na() |>
filter(!row_number() %in% c(1)) |>
mutate_at(c("total_home_y18","vacant_y18"), as.numeric) |>
mutate(prop_vacant_y18 = vacant_y18/total_home_y18)
vacancy19 =
read_csv("Data/vacancy/2019_vacancy.csv", col_names = TRUE) |>
janitor::clean_names() |>
select(geo_id, b25004_001e, b25004_008e) |>
rename(
total_home_y19 = b25004_001e,
vacant_y19 = b25004_008e) |>
drop_na() |>
filter(!row_number() %in% c(1)) |>
mutate_at(c("total_home_y19","vacant_y19"), as.numeric) |>
mutate(prop_vacant_y19 = vacant_y19/total_home_y19)
vacancy20 =
read_csv("Data/vacancy/2020_vacancy.csv", col_names = TRUE) |>
janitor::clean_names() |>
select(geo_id, b25004_001e, b25004_008e) |>
rename(
total_home_y20 = b25004_001e,
vacant_y20 = b25004_008e) |>
drop_na() |>
filter(!row_number() %in% c(1)) |>
mutate_at(c("total_home_y20","vacant_y20"), as.numeric) |>
mutate(prop_vacant_y20 = vacant_y20/total_home_y20)
vacancy21 =
read_csv("Data/vacancy/2021_vacancy.csv", col_names = TRUE) |>
janitor::clean_names() |>
select(geo_id, b25004_001e, b25004_008e) |>
rename(
total_home_y21 = b25004_001e,
vacant_y21 = b25004_008e) |>
drop_na() |>
filter(!row_number() %in% c(1)) |>
mutate_at(c("total_home_y21","vacant_y21"), as.numeric) |>
mutate(prop_vacant_y21 = vacant_y21/total_home_y21)
vacancy_10s =
full_join(vacancy18, vacancy19, by = "geo_id")
vacancy_20s =
full_join(vacancy20, vacancy21, by = "geo_id")
vacancy_all =
full_join(vacancy_10s, vacancy_20s, by = "geo_id") |>
drop_na() |>
mutate(
id = str_sub(geo_id, 10),
county = str_extract(name, "(?<=,)[^,]+(?=,)")
) |>
filter(county %in% c(" Kings County"," Bronx County"," Queens County"," Richmond County",
" New York County"))
It should be noted that the dataset included information about other types of properties but were unoccupied such as rental properties, those listed for sale, properties sold but unoccupied, those designated for migrant workers, and residences that were used occasionally/seasonally/recreationally. The only variable considered in this analysis was vacant properties however it is likely that residences that go unoccupied for several months can become nests for the rat population, increasing sightings in those areas. A more thorough and in-depth analysis should include the other variables provided in the dataset.
In general, we might be losing information with the variables chosen for analysis. During data collection, the variables were further subsetted into age groups and race/ethnicity designations. However for this analysis, the general estimates were taken for the population as a whole in NYC, therefore we did not analyze any potential disparities between race or age groups.
When importing data from 2018 and 2019, there were 4918 census tracts and when importing data from 2020 and 2021, there were 5411 census tracts. The datasets for each year were joined together by geography id to only retain census tracts that were consistent across the years. There may have been adjustments or redefining areas but that could affect analysis as well.
crowding_time = crowding_all |>
pivot_longer(less_than_1_y18:more_prop_y21,
names_to = "occupancy",
values_to = "count") |>
mutate(year = str_sub(occupancy, -3),
category = str_sub(occupancy, end=-5)) |>
mutate(
year = case_match(
year,
"y18" ~ 2018,
"y19" ~ 2019,
"y20" ~ 2020,
"y21" ~ 2021
)
) |>
mutate(year = as.factor(year)) |>
filter(category == "more_prop") |>
group_by(county,year) |>
summarise(mean = mean(count)) |>
plot_ly(x = ~year, y = ~mean, color = ~county, type = "scatter", mode = "lines",
colors = "viridis") |>
layout(title = "Timeseries of Average Proportion of Households with 1.51 or More <br> Occupants/Room by Borough in NYC",
plot_bgcolor = "e5ecf6",
xaxis = list(autotypenumbers = 'strict', title = 'Year'),
yaxis = list(title = 'Average Proportion of Housing Units'))
## `summarise()` has grouped output by 'county'. You can override using the
## `.groups` argument.
crowding_time
Over the observed years in the dataset, New York County (Manhattan) appeared to be the only county with a noticeable reduction in the average proportion of housing units with 1.51 or more occupants per room. Kings (Brooklyn) and Queens County had a slight increase while Bronx County had a slight decrease between years 2020 and 2021. Compared to Queens and Kings County, Richmond County (Staten Island) had the largest increase after the pandemic. Overall, Bronx County had the highest average proportion, 0.047, of households with 1.51 or more occupants per room at the end of 2021 while Richmond (Staten Island) County had the lowest (0.021).
edu_time = edu_all |>
pivot_longer(less_9_y18:college_y21,
names_to = "education_level",
values_to = "count") |>
mutate(year = str_sub(education_level, -3),
category = str_sub(education_level, end=-5)) |>
mutate(
year = case_match(
year,
"y18" ~ 2018,
"y19" ~ 2019,
"y20" ~ 2020,
"y21" ~ 2021
)
) |>
mutate(year = as.factor(year)) |>
filter(category == "hs_or_less") |>
group_by(county,year) |>
summarise(mean = mean(count)) |>
plot_ly(x = ~year, y = ~mean, color = ~county, type = "scatter", mode = "lines",
colors = "viridis") |>
layout(title = "Timeseries of Average Proportion of Individuals with HS Education Level <br> or less by Borough in NYC",
plot_bgcolor = "e5ecf6",
xaxis = list(autotypenumbers = 'strict', title = 'Year'),
yaxis = list(title = 'Average Number of Individuals'))
## `summarise()` has grouped output by 'county'. You can override using the
## `.groups` argument.
edu_time
Over the observed years in the dataset, all of the counties, except Richmond (Staten Island), had a decrease in the average proportion of individuals with a high school education or less. Richmond County appeared to be static over the years, with a potentially slight increase by the end of 2021. Overall, Bronx County had the highest average proportion, 0.531, of individuals ages 25 years and older with a high school education or less at the end of 2021 while New York (Manhattan) County had the lowest (0.242).
poverty_time = poverty_all |>
pivot_longer(below_poverty_y18:prop_poverty_y21,
names_to = "poverty_level",
values_to = "count") |>
mutate(year = str_sub(poverty_level, -3),
category = str_sub(poverty_level, end=-5)) |>
mutate(
year = case_match(
year,
"y18" ~ 2018,
"y19" ~ 2019,
"y20" ~ 2020,
"y21" ~ 2021
)
) |>
mutate(year = as.factor(year)) |>
filter(category == "prop_poverty") |>
group_by(county,year) |>
summarise(mean = mean(count)) |>
plot_ly(x = ~year, y = ~mean, color = ~county, type = "scatter", mode = "lines",
colors = "viridis") |>
layout(title = "Timeseries of Average Proportion of Households Below <br> the Poverty Level by Borough in NYC",
plot_bgcolor = "e5ecf6",
xaxis = list(autotypenumbers = 'strict', title = 'Year'),
yaxis = list(title = 'Average Proportion of Households Below Poverty Level'))
## `summarise()` has grouped output by 'county'. You can override using the
## `.groups` argument.
poverty_time
Over the observed years in the dataset, all of the counties had a decrease in the average proportion of households below the poverty level, with the most significant decrease occurring in the Bronx from 2018 to 2021. Overall, Bronx County had the highest average proportion, 0.257, of households below the poverty line among the population for which poverty status was determined at the end of 2021 while Queens County had the lowest (0.109).
vacancy_time = vacancy_all |>
pivot_longer(total_home_y18:prop_vacant_y21,
names_to = "vacancy",
values_to = "count") |>
mutate(year = str_sub(vacancy, -3),
category = str_sub(vacancy, end=-5)) |>
mutate(
year = case_match(
year,
"y18" ~ 2018,
"y19" ~ 2019,
"y20" ~ 2020,
"y21" ~ 2021
)
) |>
mutate(year = as.factor(year)) |>
filter(category == "prop_vacant") |>
group_by(county,year) |>
summarise(mean = mean(count)) |>
plot_ly(x = ~year, y = ~mean, color = ~county, type = "scatter", mode = "lines",
colors = "viridis") |>
layout(title = "Timeseries of Average Proportion of Vacant Properties by Borough in NYC",
plot_bgcolor = "e5ecf6",
xaxis = list(autotypenumbers = 'strict', title = 'Year'),
yaxis = list(title = 'Average Proportion'))
## `summarise()` has grouped output by 'county'. You can override using the
## `.groups` argument.
vacancy_time
Over the observed years in the dataset, Bronx and Richmond (Staten Island) County dipped during 2020 and then increased again in the average proportion of vacant properties at the end of 2021. On the other hand, Queens, Kings (Brooklyn), and New York (Manhattan) County peaked during 2020, and then decreased in the average proportion of vacant properties at the end of 2021. Overall, Queens County had the highest average proportion, 0.503, of vacant properties for residential purposes at the end of 2021 while New York (Manhattan) County had the lowest (0.242).
Bronx (highest), Richmond (lowest)
Bronx_crowding_chp
Staten_crowding_chp
Analysis
Bronx (highest), New York (lowest)
Bronx_edu_chp
Manhattan_edu_chp
Analysis
Bronx (highest), Queens (lowest)
Bronx_poverty_chp
Queens_poverty_chp
Analysis
Queens (highest), New York (lowest)
Queens_vacancy_chp
Manhattan_vacancy_chp
Analysis
kings county (brooklyn) - 36047 bronx - 36005 queens - 36081 richmond (staten island) - 36085 new york (manhattan) - 36061
Rat Sighting Distribution by Borough:
explain how each of these variables may be related to rat sightings and increased presence, go back to the rat paper possibly compare to the rat distribution
upload everything to shared repository